Data was downloaded from kaggle.com.
packs<-c("tidyverse","caret","rmarkdown","rgeos",
"rnaturalearth","rnaturalearthdata","sf",
"ggspatial","maps","plotly")
invisible(sapply(packs,require,character=T))
df<-read.csv("covid-19-all.csv")
#### Subset only countries in the Continental United States
us<-df[df$Country.Region == "US" & df$Latitude > 24 & df$Latitude < 54 &
df$Longitude < -62 & df$Longitude > -126,]
####Free memory by deleting the unused dataframe
rm(df)
####Create a basic plot of the counties in the US
plot(us$Longitude,us$Latitude)
This is the basic shape of our data. It would look better with color indicating the number of cases and overlayed on an actual plot of the United States.
####Create a new vector for the ranges of confirmed cases data
casesRange<-list(c(seq(1,49,1)),c(seq(50,99,1)),c(seq(100,199,1)),c(seq(200,499,1)),
c(seq(500,999,1)),c(seq(1000,1999,1)),c(seq(2000,9999,1)),c(0))
us$ConfirmedRange<-NA
us$ConfirmedRange[us$Confirmed %in% casesRange[[8]]]<-"0"
us$ConfirmedRange[us$Confirmed %in% casesRange[[1]]]<-"1-49"
us$ConfirmedRange[us$Confirmed %in% casesRange[[2]]]<-"50-99"
us$ConfirmedRange[us$Confirmed %in% casesRange[[3]]]<-"100-199"
us$ConfirmedRange[us$Confirmed %in% casesRange[[4]]]<-"200-499"
us$ConfirmedRange[us$Confirmed %in% casesRange[[5]]]<-"500-999"
us$ConfirmedRange[us$Confirmed %in% casesRange[[6]]]<-"1,000-1,999"
us$ConfirmedRange[us$Confirmed %in% casesRange[[7]]]<-"2,000-9,999"
us$ConfirmedRange[us$Confirmed >= 10000]<-"10,000+"
us$ConfirmedRange<-as.factor(us$ConfirmedRange)
####Reorder the factor levels from least to greatest
us$ConfirmedRange<-factor(us$ConfirmedRange,levels = levels(us$ConfirmedRange)[c(1,2,8,5,7,9,3,6,4)])
####Plot the confirmed cases by their American Latitude/Longitude
colors<-c(rgb(127,255,0,maxColorValue = 255),rgb(34,139,34,maxColorValue = 255),rgb(0,100,0,maxColorValue = 255),
rgb(255,255,0,maxColorValue = 255),rgb(255,165,0,maxColorValue = 255),rgb(255,0,0,maxColorValue = 255),
rgb(139,0,0,maxColorValue = 255),rgb(128,0,128,maxColorValue = 255),rgb(0,0,0,maxColorValue = 255))
world <- ne_countries(scale = "medium", returnclass = "sf")
states <- st_as_sf(map("state", plot = FALSE, fill = TRUE))
states <- cbind(states, st_coordinates(st_centroid(states)))
ggplot(data = world)+
geom_sf()+
geom_sf(data = states, fill = NA)+
coord_sf(xlim = c(-126, -70), ylim = c(24, 50), expand = FALSE)+
geom_point(data = us, shape = 15, aes(x = Longitude, y = Latitude, group = Province.State,
color = ConfirmedRange), size = 1)+
scale_color_manual(values = colors)+
labs(x = "Latitude", y = "Longitude", title = "Confirmed Cases by State")+
theme_void()
Next, we can look at the number of deaths per report in each state.
####We can reuse the code from before changing it to count deaths instead in a new column
us$Deaths[is.na(us$Deaths)]<-0
us$DeathsRange<-NA
us$DeathsRange[us$Deaths %in% casesRange[[8]]]<-"0"
us$DeathsRange[us$Deaths %in% casesRange[[1]]]<-"1-49"
us$DeathsRange[us$Deaths %in% casesRange[[2]]]<-"50-99"
us$DeathsRange[us$Deaths %in% casesRange[[3]]]<-"100-199"
us$DeathsRange[us$Deaths %in% casesRange[[4]]]<-"200-499"
us$DeathsRange[us$Deaths %in% casesRange[[5]]]<-"500-999"
us$DeathsRange[us$Deaths %in% casesRange[[6]]]<-"1,000-1,999"
us$DeathsRange[us$Deaths %in% casesRange[[7]]]<-"2,000-9,999"
us$DeathsRange[us$Deaths >= 10000]<-"10,000+"
us$DeathsRange<-as.factor(us$DeathsRange)
us$DeathsRange<-factor(us$DeathsRange,levels = levels(us$DeathsRange)[c(1,2,8,5,7,9,3,6,4)])
####We can use largely the same plot code from before to plot the deaths
ggplot(data = world)+
geom_sf()+
geom_sf(data = states, fill = NA)+
coord_sf(xlim = c(-126, -70), ylim = c(24, 50), expand = FALSE)+
geom_point(data = us, shape = 15, aes(x = Longitude, y = Latitude, group = Province.State,
color = DeathsRange), size = 1)+
scale_color_manual(values = colors)+
labs(x = "Latitude", y = "Longitude", title = "Deaths by State")+
theme_void()
When comparing the cases to deaths, most of the same areas light up, but at a lower rate than expected
Going beyond static images, We can also use plotly to generate an interactive plot that supports zooming and subsetting of data to show multiple points at once.
####First, we need to create unique IDs for cities that have reported multiple times.
us$Loc<-paste(us$Latitude,us$Longitude)
us$Loc<-as.factor(us$Loc)
####Then we take only the highest number for cumulative confirmed, deaths, and recovered.
us$Date<-as.Date(us$Date)
usg<- us %>%
arrange(desc(Date)) %>%
group_by(Loc) %>%
top_n(1,Date) ####Select only the latest date for each location
g <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showland = TRUE,
landcolor = toRGB("gray70"),
subunitcolor = toRGB("gray85"),
countrycolor = toRGB("gray85"),
countrywidth = 0.5,
subunitwidth = 0.5
)
fig <- plot_geo(usg, lat = ~Latitude, lon = ~Longitude, colors = colors)
fig <- fig %>% add_markers(
text = ~paste(Province.State,paste("Confirmed",Confirmed),paste("Deaths",Deaths),paste("Recovered",Recovered), sep = "<br />"),
color = ~ConfirmedRange,
symbol = I("square"),
size = I(8),
hoverinfo = "text",
)
fig <- fig %>% layout(
title = 'Coronavirus in the United States<br />(Hover for totals)', geo = g
)
##This interactive plotly figure displays the total confirmed cases, deaths, and recovered in the United States.
##The plot can be dragged and zoomed and data can be hidden by severity group.
fig